In [1]:
from sklearn.svm import SVC
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV
from matplotlib import pyplot as plt
from pandas.plotting import scatter_matrix
%matplotlib inline

In [5]:
data = pd.read_csv("../../Data/WeatherOutagesAll_RK.csv")

In [8]:
data.head()


Out[8]:
Total_outages Day_length_hr Avg_Temp_F Avg_humidity_percent Max_windspeed_mph Avg_windspeed_mph Max_windgust_mph Precipitation_in outage_class
0 0 12.783333 58 89 8 2 9 0.01 0.0
1 2 12.716667 62 69 8 4 9 0.00 0.0
2 1 12.666667 64 74 16 7 25 0.00 0.0
3 0 12.616667 71 66 17 12 9 0.00 0.0
4 1 12.550000 66 79 12 5 9 0.00 0.0

In [7]:
# Discretize Total_outages into three classes:
# 0 = normal (< 3 outages), 1 = bad (3-7), 2 = extreme (> 7)
data["outage_class"] = np.select(
    [data.Total_outages < 3, data.Total_outages > 7], [0, 2], default=1)

In [34]:
scaler = StandardScaler()

In [35]:
# Grid search over C and gamma for an RBF-kernel SVC
xTrain = None
yTrain = None
xTest = None
yTest = None
df = None
# Assumed splitter definition (missing in the original cell): a single
# 80/20 shuffle split, matching the one run shown in the output below.
splitter = ShuffleSplit(n_splits=1, test_size=0.2)

for train, test in splitter.split(data):
    xTrain = scaler.fit_transform(data.iloc[train, 1:-1])
    yTrain = data.iloc[train, -1]
    xTest = scaler.transform(data.iloc[test, 1:-1])
    yTest = data.iloc[test, -1]
    cRange = np.logspace(-3, 3, 7)
    gammaRange = np.logspace(-3, 3, 7)
    paramGrid = dict(gamma=gammaRange, C=cRange)
    grid = GridSearchCV(SVC(cache_size=1000.0, verbose=1, class_weight="balanced"),
                        param_grid=paramGrid, cv=KFold(n_splits=5, shuffle=True), verbose=True)
    grid.fit(xTrain, yTrain)
    print("The best parameters are %s with a score of %0.2f"
      % (grid.best_params_, grid.best_score_))
    df = pd.DataFrame(grid.cv_results_)
    df.to_csv("resultsRBF1.csv")


Fitting 5 folds for each of 49 candidates, totalling 245 fits
[Parallel(n_jobs=1)]: Done 245 out of 245 | elapsed:  6.8min finished
The best parameters are {'C': 1.0, 'gamma': 100.0} with a score of 0.88
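
To see where in the (C, gamma) grid the score peaks, cv_results_ can be rendered as a heatmap. A minimal sketch, reusing cRange, gammaRange, and grid from the cell above (the reshape assumes scikit-learn's alphabetical parameter ordering, with C varying slowest):

# Heatmap of mean CV accuracy over the (C, gamma) grid.
scores = grid.cv_results_["mean_test_score"].reshape(len(cRange), len(gammaRange))
plt.imshow(scores, interpolation="nearest", cmap="viridis")
plt.xticks(np.arange(len(gammaRange)), gammaRange, rotation=45)
plt.yticks(np.arange(len(cRange)), cRange)
plt.xlabel("gamma")
plt.ylabel("C")
plt.colorbar(label="mean CV accuracy")
plt.show()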

Overall accuracy is dominated by class 0 (quiet days), so we need a metric that specifically measures how well classes 1 (bad) and 2 (extreme) are predicted.
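
scikit-learn's per-class metrics make this visible directly; a minimal sketch with illustrative labels (the hand-rolled counters in the next cell compute similar quantities):

# Per-class recall exposes minority-class performance that overall
# accuracy hides. The labels here are illustrative, not from the dataset.
from sklearn.metrics import classification_report, recall_score

yTrue = [0, 0, 0, 0, 1, 1, 2]
yPred = [0, 0, 0, 1, 0, 1, 0]
print(classification_report(yTrue, yPred))
print(recall_score(yTrue, yPred, labels=[1, 2], average=None))  # recall for classes 1 and 2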


In [65]:
xTrain = None
yTrain = None
xTest = None
yTest = None
nSplits = 10
shuffleSplitter = ShuffleSplit(n_splits=nSplits, test_size=0.2)
# C and gamma here are placeholders; set_params() overwrites them in the sweep below.
classifier = SVC(C=100, gamma=0.1, cache_size=1000.0, class_weight="balanced", probability=True)

df = pd.DataFrame()
for cValue in np.logspace(-5, 5, 11):
    for gammaValue in np.logspace(-5, 5, 11):
        classifier.set_params(C = cValue, gamma = gammaValue)
        allCorrectness = []
        allFalseNegativesBad = []
        allFalseNegativesExtreme = []
        allFalsePositives = []
        allBadDays = []
        allExtremeDays = []
        
        for train, test in shuffleSplitter.split(data):
            overallCorrectness = 0
            falseNegativesBad = 0
            falseNegativesExtreme = 0
            falsePositives = 0
            badDays = 0
            extremeDays = 0
            xTrain = scaler.fit_transform(data.iloc[train, 1:-1])
            yTrain = data.iloc[train, -1]
            xTest = scaler.transform(data.iloc[test, 1:-1])
            yTest = data.iloc[test, -1]
            classifier.fit(xTrain, yTrain)
            yPredict = classifier.predict(xTest)
            yTest = yTest.tolist()
            yPredict = yPredict.tolist()
            for i in range(len(yTest)):
                if yTest[i] == 1:
                    badDays += 1
                elif yTest[i] == 2:
                    extremeDays += 1
                if yTest[i] == yPredict[i]:
                    overallCorrectness += 1
                elif yTest[i] < yPredict[i]:
                    falsePositives += 1
                elif yTest[i] == 1 and yPredict[i] == 0:
                    falseNegativesBad += 1
                else: # yTest[i] == 2 and yPredict[i] != 2
                    falseNegativesExtreme += 1
            allCorrectness.append(overallCorrectness / len(yTest))
            allBadDays.append(badDays)
            allExtremeDays.append(extremeDays)
            if badDays != 0:
                allFalseNegativesBad.append(falseNegativesBad / badDays)
                allFalsePositives.append(falsePositives / (badDays + extremeDays))
            else:
                allFalseNegativesBad.append(0)
                allFalsePositives.append(0)
            if extremeDays != 0:
                allFalseNegativesExtreme.append(falseNegativesExtreme / extremeDays)
            else:
                allFalseNegativesExtreme.append(0)
            
        df = df.append({"C": cValue, "gamma": gammaValue,
                        "Overall_Correctness": np.mean(allCorrectness),
                        "False_Negatives_Extreme": np.mean(allFalseNegativesExtreme),
                        "False_Negatives_Bad": np.mean(allFalseNegativesBad),
                        "False_Positives": np.mean(allFalsePositives),
                        "Bad_Days": np.mean(allBadDays),
                        "Extreme_Days": np.mean(allExtremeDays)}, ignore_index=True)
df.to_csv("gridSearch2.csv")
df


Out[65]:
Bad_Days C Extreme_Days False_Negatives_Bad False_Negatives_Extreme False_Positives Overall_Correctness gamma
0 135.2 0.00001 14.1 0.300000 0.500000 5.745204 0.293027 0.00001
1 135.2 0.00001 13.7 0.600000 0.800000 3.308781 0.552162 0.00010
2 135.2 0.00001 14.4 0.500000 0.900000 3.901914 0.481995 0.00100
3 135.2 0.00001 14.5 0.300000 0.800000 5.427809 0.323036 0.01000
4 135.2 0.00001 13.3 0.800000 1.000000 1.530679 0.728332 0.10000
5 135.2 0.00001 13.3 0.600000 0.700000 3.746154 0.542277 1.00000
6 135.2 0.00001 13.8 0.600000 0.800000 3.387290 0.552780 10.00000
7 135.2 0.00001 13.7 0.400000 0.700000 4.776159 0.390733 100.00000
8 135.2 0.00001 11.9 0.900000 1.000000 0.686806 0.806708 1000.00000
9 135.2 0.00001 14.1 0.700000 0.900000 2.389249 0.641483 10000.00000
10 135.2 0.00001 14.7 0.600000 0.700000 3.036525 0.546778 100000.00000
11 135.2 0.00010 12.7 0.500000 0.700000 4.216205 0.469550 0.00001
12 135.2 0.00010 13.9 0.300000 0.600000 5.645703 0.305296 0.00010
13 135.2 0.00010 15.5 0.200000 0.500000 6.603053 0.213504 0.00100
14 135.2 0.00010 12.5 0.400000 0.800000 5.096034 0.392939 0.01000
15 135.2 0.00010 13.2 0.300000 0.600000 5.634613 0.301942 0.10000
16 135.2 0.00010 12.5 0.400000 0.900000 4.947746 0.405296 1.00000
17 135.2 0.00010 13.9 0.300000 0.500000 6.070497 0.290556 10.00000
18 135.2 0.00010 13.1 0.300000 0.800000 5.696443 0.318447 100.00000
19 135.2 0.00010 14.6 0.600000 0.700000 3.559915 0.545984 1000.00000
20 135.2 0.00010 14.2 0.500000 0.700000 4.005318 0.468226 10000.00000
21 135.2 0.00010 13.3 0.200000 0.800000 6.460321 0.239011 100000.00000
22 135.2 0.00100 14.6 0.400000 0.780000 4.508929 0.400177 0.00001
23 135.2 0.00100 14.5 0.497479 0.758333 4.025406 0.474404 0.00010
24 135.2 0.00100 13.7 0.200000 0.400000 6.673008 0.204325 0.00100
25 135.2 0.00100 14.1 0.300000 0.700000 5.660267 0.310591 0.01000
26 135.2 0.00100 15.1 0.597541 0.588235 3.429097 0.532215 0.10000
27 135.2 0.00100 15.3 0.500000 0.800000 3.885627 0.477846 1.00000
28 135.2 0.00100 12.9 0.200000 0.400000 6.775439 0.206796 10.00000
29 135.2 0.00100 13.0 0.500000 0.700000 4.103879 0.464166 100.00000
... ... ... ... ... ... ... ... ...
91 135.2 1000.00000 12.6 0.363103 0.694678 2.108924 0.724184 0.01000
92 135.2 1000.00000 13.8 0.501265 0.864293 1.597682 0.759135 0.10000
93 135.2 1000.00000 13.5 0.780459 0.976975 0.719612 0.825419 1.00000
94 135.2 1000.00000 13.4 0.963397 1.000000 0.065507 0.881112 10.00000
95 135.2 1000.00000 15.2 1.000000 1.000000 0.000676 0.881465 100.00000
96 135.2 1000.00000 15.0 1.000000 1.000000 0.000000 0.889144 1000.00000
97 135.2 1000.00000 14.7 1.000000 1.000000 0.000000 0.885260 10000.00000
98 135.2 1000.00000 15.2 1.000000 1.000000 0.000000 0.887467 100000.00000
99 135.2 10000.00000 12.4 0.262630 0.418801 3.214976 0.611121 0.00001
100 135.2 10000.00000 14.2 0.242541 0.359097 2.709883 0.654898 0.00010
101 135.2 10000.00000 14.3 0.317330 0.483247 2.179486 0.696911 0.00100
102 135.2 10000.00000 14.3 0.391274 0.842637 2.010767 0.721801 0.01000
103 135.2 10000.00000 13.9 0.561546 0.866655 1.396852 0.769638 0.10000
104 135.2 10000.00000 13.9 0.781261 0.934774 0.695464 0.829038 1.00000
105 135.2 10000.00000 13.3 0.973954 1.000000 0.061310 0.879965 10.00000
106 135.2 10000.00000 13.4 1.000000 1.000000 0.001637 0.888703 100.00000
107 135.2 10000.00000 13.0 1.000000 1.000000 0.000000 0.881024 1000.00000
108 135.2 10000.00000 13.9 1.000000 1.000000 0.000000 0.884643 10000.00000
109 135.2 10000.00000 15.2 1.000000 1.000000 0.000000 0.887643 100000.00000
110 135.2 100000.00000 13.0 0.227426 0.339827 3.218824 0.618447 0.00001
111 135.2 100000.00000 15.3 0.271505 0.466130 2.357395 0.700088 0.00010
112 135.2 100000.00000 13.3 0.318043 0.666409 2.150083 0.713416 0.00100
113 135.2 100000.00000 14.3 0.406561 0.787130 1.701937 0.742454 0.01000
114 135.2 100000.00000 14.5 0.605555 0.867419 1.418814 0.775816 0.10000
115 135.2 100000.00000 14.7 0.785395 0.933780 0.690131 0.826125 1.00000
116 135.2 100000.00000 13.1 0.969717 1.000000 0.072606 0.876964 10.00000
117 135.2 100000.00000 14.8 1.000000 1.000000 0.001492 0.880141 100.00000
118 135.2 100000.00000 14.3 1.000000 1.000000 0.000000 0.886055 1000.00000
119 135.2 100000.00000 14.0 1.000000 1.000000 0.000000 0.886849 10000.00000
120 135.2 100000.00000 15.2 1.000000 1.000000 0.000000 0.881553 100000.00000

121 rows × 8 columns
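
With the full sweep saved to gridSearch2.csv, one way to choose an operating point is to filter for (C, gamma) pairs that keep the minority-class miss rates down while retaining acceptable overall correctness. A sketch; the thresholds are illustrative:

# Illustrative thresholds: at most half the bad/extreme days missed,
# with overall correctness of at least 0.6.
results = pd.read_csv("gridSearch2.csv", index_col=0)
candidates = results[(results.False_Negatives_Bad <= 0.5)
                     & (results.False_Negatives_Extreme <= 0.5)
                     & (results.Overall_Correctness >= 0.6)]
print(candidates.sort_values("Overall_Correctness", ascending=False))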


In [56]:
xTrain = None
yTrain = None
scaler = StandardScaler()
# Final model: C and gamma chosen from the sweep results above.
classifier = SVC(C=10, gamma=0.01, class_weight="balanced", probability=True)

xTrain = scaler.fit_transform(data.iloc[:, 1:-1])
yTrain = data.iloc[:, -1]
classifier.fit(xTrain, yTrain)


Out[56]:
SVC(C=10, cache_size=200, class_weight='balanced', coef0=0.0,
  decision_function_shape=None, degree=3, gamma=0.01, kernel='rbf',
  max_iter=-1, probability=True, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [82]:
from sklearn.externals import joblib
joblib.dump(classifier, 'SVCmodel.pkl')
joblib.dump(scaler, 'scaler.pkl')


Out[82]:
['scaler.pkl']

In [12]:
from sklearn.externals import joblib
classifier = joblib.load("SVCmodel.pkl")
scaler = joblib.load("scaler.pkl")
# One sample with the 7 features, in column order: Day_length_hr, Avg_Temp_F,
# Avg_humidity_percent, Max_windspeed_mph, Avg_windspeed_mph, Max_windgust_mph,
# Precipitation_in. Already shaped (1, 7), so no reshape is needed.
bar = np.array([[12, 70, 80, 8, 5, 10, 0.]])
bar2 = scaler.transform(bar)
print(classifier.predict(bar2))
classifier.predict_proba(bar2)


[ 0.]
Out[12]:
array([[ 0.96614198,  0.03266029,  0.00119773]])
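
The three probability columns follow the order of classifier.classes_, so the first entry is the probability of class 0; a quick check:

# predict_proba columns follow classifier.classes_ (classes 0, 1, 2 in order).
print(classifier.classes_)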

In [13]:
def predictOutage(weatherData):
    # Returns the predicted outage class (0, 1, or 2) for each sample.
    if weatherData.shape[1] != 7:
        raise ValueError("7 features are required. See documentation.")
    model = joblib.load("SVCmodel.pkl")
    scaler = joblib.load("scaler.pkl")
    scaledData = scaler.transform(weatherData)
    return model.predict(scaledData)
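
For example, with the same feature vector used earlier:

# Hypothetical call; returns the predicted class label for one sample.
predictOutage(np.array([[12, 70, 80, 8, 5, 10, 0.]]))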

In [7]:
from sklearn.externals import joblib
bar = np.array([[12,70,80,8,5,10,0.]])
def predictOutageProba(weatherData):
    if weatherData.shape[1] != 7:
        raise ValueError("7 features are required. See documentation.")
    model = joblib.load("SVCmodel.pkl")
    scaler = joblib.load("scaler.pkl")
    scaledData = scaler.transform(weatherData)
    return model.predict_proba(scaledData)
predictOutageProba(bar)


Out[7]:
array([[ 0.96614198,  0.03266029,  0.00119773]])

Plot data distribution


In [101]:
plt.hist(data.Total_outages, bins=50)
plt.show()


